PRASFUR TIWARI
DIVYANSH TIWARI
SONAM KUMAR
Short for coronavirus disease 2019, COVID-19 is an infectious disease caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). It was first identified in December 2019 in Wuhan, China, and has since spread globally, resulting in an ongoing pandemic.
For more details about COVID-19, visit its wikipedia page from here.
In this project, we intend to analyse the effects of COVID-19 in India before the lockdown. We will analyse the total number of confirmed, cured and death cases. Our analysis will be based on 4 main factors:
Also, we will be predicting the possible number of infected cases if there was no/delayed lockdown, so as to estimate the success of lockdown.
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import folium
from geopy.geocoders import Nominatim
import requests
import warnings
warnings.filterwarnings('ignore')
#from pylab import rcParams
# Load the 2011 census population table and the state-wise COVID-19 case log.
df_population = pd.read_csv(
    'https://raw.githubusercontent.com/DivyT-03/Project/master/population_india_census2011.csv'
)
df_population.head()
df_main = pd.read_csv(
    'https://raw.githubusercontent.com/DivyT-03/Project/master/covid_19_india(16.4.20).csv'
)
df_main.head()
df_main.dtypes

# Keep only the columns that the rest of the analysis reads.
wanted_columns = ["Date", "State/UnionTerritory", "Cured", "Deaths", "Confirmed"]
df = df_main[wanted_columns]
df.head()

# Alphabetically ordered state / union-territory names from the census table.
state = sorted(df_population['State / Union Territory'].tolist())
print("Total states + union territories:", len(state))

# Distinct date values found in the case log.
dates = list(df["Date"].unique())
print("Total days:", len(dates))
# Build a dense (day x state) table: one row for every combination of day
# number and state number, with zero counts where the case log has no entry
# for that state on that day.
col = ["Day", "State_ID", "Cured", "Deaths", "Confirmed"]
Date_ID = []
State_ID = []
Cured = []
Deaths = []
Confirmed = []
for d_id, d in enumerate(dates, 1):
    # Filter by date once per day rather than once per (day, state) pair.
    df_day = df[df["Date"] == d]
    for s_id, s in enumerate(state, 1):
        df_val = df_day[df_day["State/UnionTerritory"] == s]
        Date_ID.append(d_id)
        State_ID.append(s_id)
        try:
            Cu = df_val.Cured.to_list()[0]
            De = df_val.Deaths.to_list()[0]
            Co = df_val.Confirmed.to_list()[0]
        except IndexError:
            # No row for this state on this day: record zeros. Only the
            # "empty selection" case is caught so that any other failure
            # (e.g. a missing column) still surfaces.
            Cu = 0
            De = 0
            Co = 0
        Cured.append(Cu)
        Deaths.append(De)
        Confirmed.append(Co)
list_of_tuples = list(zip(Date_ID, State_ID, Cured, Deaths, Confirmed))
df_cv = pd.DataFrame(list_of_tuples, columns=col)
df_cv.head()
print("Our dataset has {} rows and {} columns".format(df_cv.shape[0], df_cv.shape[1]))
# Element-wise missing-value mask, then the number of missing values per column.
df_cv.isna()
df_cv.isna().sum()
import seaborn as sns

# Visual check of the same mask as a heatmap, one cell per table entry.
plt.rcParams['figure.figsize'] = (7, 5)
sns.heatmap(df_cv.isna(), yticklabels=False)
plt.show()
As there are no null values in our dataframe, we do not need to perform any clean-up.
# Print the column names, non-null counts and dtypes of the day x state table.
df_cv.info()
# Descriptive statistics of the table's columns.
df_cv.describe()
# Data type of each column.
df_cv.dtypes
# Build the state lookup table: name, 1-based state number and population.
population = []
for s in state:
    df_val = df_population[df_population["State / Union Territory"] == s]
    # Deliberately kept as a list per state: later code reads the value back
    # with `df_stateinfo['Population'][p][0]`.
    Po = df_val.Population.tolist()
    population.append(Po)
# Number the states 1..N explicitly. This matches the numbering produced by
# `enumerate(state, 1)` when the day x state table was built, without relying
# on zip() truncating the much longer per-row State_ID list.
state_numbers = range(1, len(state) + 1)
list_of_tuples = list(zip(state, state_numbers, population))
df_stateinfo = pd.DataFrame(list_of_tuples, columns=["State", "State ID", "Population"])
df_stateinfo.head()
# Pairwise correlation matrix of the day x state table.
df_cv.corr()
from scipy import stats

# Pearson correlation (and its p-value) between the day number and each count.
for target in ('Confirmed', 'Cured', 'Deaths'):
    pearson_coef, p_value = stats.pearsonr(df_cv['Day'], df_cv[target])
    print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)
df_cv.head()
We take only the first 56 days of data, i.e. the data up to the first lockdown, and keep the remaining days for the prediction stage.
# Pre-lockdown subset: rows for days 1 to 56.
df_cv1 = df_cv[df_cv['Day'] < 57]


def _show_regplot(x_col, y_col, color, title):
    """Draw and show a regression plot of df_cv1[y_col] against df_cv1[x_col]."""
    plt.figure(1, figsize=(10, 5))
    # x and y are passed by keyword: recent seaborn releases accept the data
    # vectors of regplot only as keyword arguments.
    sns.regplot(x=df_cv1[x_col], y=df_cv1[y_col], color=color)
    plt.title(title, size=20)
    plt.show()


# Plotting the heatmap of the data set
plt.figure(figsize=(10, 8))
sns.heatmap(df_cv1.corr(), linewidths=3)
plt.title('Heatmap')
plt.show()

# Relation between Confirmed Cases and Days
_show_regplot('Confirmed', 'Day', 'aqua', 'Confirmed Vs Days')

# Scatter Plot Between Confirmed Cases and Days
plt.figure(1, figsize=(20, 10))
# Draw on the figure created above; without `ax`, DataFrame.plot opens a
# figure of its own and the sized one is left empty.
df_cv1.plot(kind='scatter', y='Confirmed', x='Day', alpha=.5, ax=plt.gca())
plt.xlabel('Days', size=15)
plt.ylabel('Confirmed', size=15)
plt.title('Confirmed Vs Days Scatter Plot', size=20)
plt.show()

# Relation between Confirmed Cases and Cured Cases
_show_regplot('Confirmed', 'Cured', 'orange', 'Confirmed Vs Cured')

# Relation between Confirmed Cases and Deaths
_show_regplot('Confirmed', 'Deaths', 'Blue', 'Confirmed Vs Deaths')

# Relation between Cured Cases and Days
_show_regplot('Cured', 'Day', 'LightGreen', 'Cured Vs Days')

# Relation between Cured Cases and Deaths
_show_regplot('Cured', 'Deaths', 'Red', 'Cured Vs Deaths')

# Relation between Deaths and Days
_show_regplot('Deaths', 'Day', 'Blue', 'Days Vs Deaths')
# Collapse the pre-lockdown table to one row per day by summing over states.
df2 = df_cv1.drop(columns=["State_ID"]).groupby(['Day']).sum()
df2.reset_index(inplace=True)
df2.head()


def _daily_bar_chart(frame, column, y_label, colour):
    """Show a large bar chart of frame[column] against frame['Day']."""
    plt.figure(figsize=(35, 20))
    plt.bar(frame['Day'], frame[column], align='center', alpha=0.5, color=colour)
    plt.title("Rate vs Date", size=40)
    plt.xlabel("Day", size=35)
    plt.xticks(fontsize=20, fontname='monospace')
    plt.ylabel(y_label, size=35)
    plt.yticks(fontsize=30, fontname='monospace')
    plt.show()


# Confirmed cases per day.
_daily_bar_chart(df2, 'Confirmed', "Cases", 'purple')
print("Confirmed cases:", df2['Confirmed'].max())

# Cured cases per day.
d3 = df2.copy()
_daily_bar_chart(d3, 'Cured', "Cases", "Green")

# Deaths per day.
d4 = df2.copy()
_daily_bar_chart(d4, 'Deaths', "Deaths", "Red")

# Headline figures: the largest daily totals and the cured / death shares
# relative to the largest confirmed total.
a = df2['Confirmed'].max()
b = df2['Cured'].max()
c = df2['Deaths'].max()
d = b / a * 100
e = c / a * 100
A = {
    "Feature": ["Total Infected", "Total Cured", "Total Deaths", "Cured %", "Death %"],
    "Analysis": [a, b, c, d, e],
}
Analysis = pd.DataFrame(A)
# Casting to int64 drops the fractional part of the two percentages.
Analysis['Analysis'] = Analysis['Analysis'].astype('int64')
# Both columns go into the index, so the frame shows as a two-column listing.
Analysis.set_index(['Feature', 'Analysis'], inplace=True)
Analysis
# Per-state figures for the pre-lockdown window (days 1 to 56).
#
# The counts are treated as running totals elsewhere in this analysis (the
# national "Total Infected" is read off as the maximum of the daily sums),
# so each state's figure is the largest value it reached. Summing over days
# would count the same cases once for every day they appear.
# NOTE(review): confirm that the source CSV reports cumulative counts.
df3 = df_cv[df_cv['Day'] < 57].drop(columns=['Day'])
df3 = df3.groupby(['State_ID']).max()
df3.reset_index(inplace=True)
# Rows are ordered by State_ID (1..N), which lines up with the row order of
# df_stateinfo, so the names can be attached by position.
df3['State_Name'] = df_stateinfo['State']
df3.head()
def _state_bar_chart(column, title, y_label, opacity, colour):
    """Show a large bar chart of df3[column] for every state."""
    plt.figure(figsize=(35, 20))
    plt.bar(df3['State_Name'], df3[column], align='center', alpha=opacity, color=colour)
    plt.title(title, size=40)
    plt.xlabel("State", size=35)
    plt.xticks(fontsize=20, fontname='monospace', rotation=90)
    plt.ylabel(y_label, size=35)
    plt.yticks(fontsize=30, fontname='monospace')
    plt.show()


# --- Confirmed cases per state ---
_state_bar_chart('Confirmed', "Case Count vs State", "Cases", 0.5, "Blue")
d5 = df3.sort_values(["Confirmed"], axis=0, ascending=False)
print("Top 5 most infected states:\n")
d5[['State_Name', 'Confirmed']].head()

# Radar chart of the five states with the highest confirmed counts.
import plotly.graph_objects as go
top_confirmed = d5.head()
radar = go.Scatterpolar(r=top_confirmed['Confirmed'], theta=top_confirmed['State_Name'], fill='toself')
fig1 = go.Figure(data=radar)
fig1.update_layout(polar=dict(radialaxis=dict(visible=True),), showlegend=False)
fig1.show()

# Boxplot for confirmed cases per state
plt.figure(figsize=(7, 5))
df3.boxplot(['Confirmed'], grid=True, fontsize=15, vert=False, showfliers=False)
plt.show()

# --- Cured cases per state ---
_state_bar_chart('Cured', "Cured vs State", "Cured", 0.7, "Orange")
d6 = df3.sort_values(["Cured"], axis=0, ascending=False)
print("Top 5 states with most cured cases:\n")
d6[['State_Name', 'Cured']].head()

# --- Deaths per state ---
_state_bar_chart('Deaths', "Deaths vs State", "Deaths", 0.9, "Red")
d7 = df3.sort_values(["Deaths"], axis=0, ascending=False)
print("Top 5 states with most deaths:\n")
d7[['State_Name', 'Deaths']].head()

# Pie chart depiction of deaths in top 5 states
plt.figure()
fig, ax = plt.subplots(figsize=(7, 7))
plt.subplots_adjust(left=0.5, wspace=0.2)
top_deaths = d7.head()
ax.pie(
    top_deaths['Deaths'],
    explode=(0.1, 0, 0, 0, 0),  # pull the largest slice out slightly
    labels=top_deaths['State_Name'],
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax.axis('equal')
plt.legend(top_deaths['State_Name'], loc="best")
plt.tight_layout()
plt.show()
# Interactive map with one marker per state showing its case figures.
only_state = df3.copy()
# Rows removed by index label.
# NOTE(review): presumably these are the union territories, going by the
# variable name; verify against the State_Name column.
only_state.drop([0, 5, 7, 17, 18, 26, 31], inplace=True)
lat = []
lng = []
geolocator = Nominatim(user_agent='foursquare_api')
for nm in only_state['State_Name']:
    location = geolocator.geocode(str(nm), timeout=10)
    if location is None:
        # The geocoder returns None when it finds no match. Record the miss
        # as NaN so the coordinate lists stay aligned with the rows.
        lat.append(float('nan'))
        lng.append(float('nan'))
        continue
    lat.append(location.latitude)
    lng.append(location.longitude)
only_state['Latitude'] = lat
only_state['Longitude'] = lng

# Only rows that were geocoded successfully can be placed on the map.
located = only_state.dropna(subset=['Latitude', 'Longitude'])
wm = folium.Map(zoom_start=5, location=[located['Latitude'].mean(), located['Longitude'].mean()])
mp = folium.map.FeatureGroup()
for i, j, k, l, m in zip(located['Latitude'], located['Longitude'], located['Confirmed'], located['Cured'], located['Deaths']):
    # A small circle plus a clickable marker whose popup lists the figures.
    mp.add_child(folium.CircleMarker(location=[i, j], radius=5, color='red', fill_color='Yellow'))
    folium.Marker([i, j], popup='Confirmed:' + str(k) + '\nCured:' + str(l) + "\nDeaths:" + str(m)).add_to(mp)
wm.add_child(mp)
wm
# Each Population cell of df_stateinfo holds a list; take its first element
# to get one population figure per state.
pop = [df_stateinfo['Population'][row][0] for row in range(len(df_stateinfo))]

# Per-state case figures with the population attached as an extra column.
df_pop = df3.copy()
df_pop['Population'] = pop
df_pop.head()
def _population_regplot(frame, y_col, color, title):
    """Draw and show a regression plot of frame[y_col] against Population."""
    plt.figure(1, figsize=(10, 5))
    # x and y are passed by keyword: recent seaborn releases accept the data
    # vectors of regplot only as keyword arguments.
    sns.regplot(x=frame['Population'], y=frame[y_col], color=color)
    plt.title(title, size=20)
    plt.show()


# States ordered from most to least populated.
most = df_pop.copy()
most.sort_values(["Population"], axis=0, ascending=False, inplace=True)

# Bar chart of the ten most populated states.
plt.figure(figsize=(35, 20))
plt.bar(most['State_Name'].head(10), most['Population'].head(10), align='center', alpha=0.9, color="purple")
plt.title("Population vs State", size=40)
plt.xlabel("State", size=35)
plt.xticks(fontsize=20, fontname='monospace')
plt.ylabel("Population", size=35)
plt.yticks(fontsize=30, fontname='monospace')
plt.show()
print("Top 5 most populated states:")
most[['State_Name', 'Population']].head()

# Case figures as a percentage of each state's population.
most['Confirmed %'] = round((most['Confirmed'] / most['Population']) * 100, 9)
most['Cured %'] = round(most['Cured'] / most['Population'] * 100, 9)
most['Death %'] = round(most['Deaths'] / most['Population'] * 100, 9)
most.head()

# Relation between Population and Confirmed
most1 = most.copy()
_population_regplot(most1, 'Confirmed', 'brown', 'Population Vs Confirmed')
most1.sort_values(["Confirmed %"], axis=0, ascending=False, inplace=True)
print("States with most affected population: ")
most1[['State_Name', 'Confirmed', 'Confirmed %']].head()

# Relation between Population and Cured
most2 = most.copy()
# The title previously read 'Population Vs Confirmed' although the plot
# shows the Cured column.
_population_regplot(most2, 'Cured', 'magenta', 'Population Vs Cured')
most2.sort_values(["Cured %"], axis=0, ascending=False, inplace=True)
print("States with most cured population: ")
most2[['State_Name', 'Cured', 'Cured %']].head()

# Relation between Population and Deaths
most3 = most.copy()
_population_regplot(most3, 'Deaths', 'black', 'Population Vs Deaths')
most3.sort_values(["Death %"], axis=0, ascending=False, inplace=True)
print("States with most deaths: ")
most3[['State_Name', 'Deaths', 'Death %']].head()
This data covers the first 60 days, i.e., up to 28-03-2020.
# Case counts broken down by age group.
age = pd.read_csv(
    'https://raw.githubusercontent.com/DivyT-03/Project/master/AgeGroupDetails.csv'
)
age
# Remove the row with index label 9 before plotting.
age.drop([9], inplace=True)

# Boxplot for cases per age-group
plt.figure(figsize=(7, 5))
age.boxplot(['TotalCases'], grid=False, fontsize=15)
plt.show()

# Bar chart of total cases for every age group.
plt.figure(figsize=(35, 20))
plt.bar(age['AgeGroup'], age['TotalCases'], align='center', alpha=0.9)
plt.title("Age-Group Vs Cases", size=40)
plt.xlabel("Age-Group", size=35)
plt.xticks(fontsize=20, fontname='monospace', rotation=90)
plt.ylabel("Cases", size=35)
plt.yticks(fontsize=30, fontname='monospace')
plt.show()

# Age groups ranked by case count, indexed by the group label.
age1 = age.sort_values(["TotalCases"], axis=0, ascending=False)
age1 = age1.drop(columns=['Sno'])
age1 = age1.set_index(['AgeGroup'])
print("Mostly affected age-groups are:")
age1.head(2)
As per the plots we obtained above, we intend to use an exponential regression model to predict the infected, cured and death rates.
# Distribution of the three daily totals.
df2.hist(['Confirmed', 'Cured', 'Deaths'])
plt.show()
import numpy as np

# Random split of the days: each day lands in the training set with
# probability 0.75, otherwise in the test set.
msk = np.random.rand(len(df2)) < .75
train = df2[msk]
test = df2[~msk]

# Day numbers are the model input; each count column is a separate target.
train_x, test_x = train['Day'].values, test['Day'].values
train_y1, test_y1 = train['Confirmed'].values, test['Confirmed'].values
train_y2, test_y2 = train['Cured'].values, test['Cured'].values
train_y3, test_y3 = train['Deaths'].values, test['Deaths'].values
def sigmoid(x, a, b):
    """Evaluate the exponential curve ``a * exp(b * x)``.

    Despite its name, this is a plain exponential rather than a logistic
    function. ``x`` may be a scalar or a NumPy array; ``a`` scales the curve
    and ``b`` is the growth rate.
    """
    return a * np.exp(b * x)
'''This function has a particular limit, hence we find the best parameters only limited number of times. Hence this code is a comment'''
from scipy.optimize import curve_fit

# The Confirmed and Cured fits are not re-run here; their parameters are the
# stored values below. The disabled calls were:
#   popt1, pcov1 = curve_fit(sigmoid, train_x, train_y1)
#   popt2, pcov2 = curve_fit(sigmoid, train_x, train_y2)
# Only the Deaths curve is fitted on the current training split.
popt3, pcov3 = curve_fit(sigmoid, train_x, train_y3)

# Stored parameters: (a1, b1) Confirmed, (a2, b2) Cured, (a3, b3) Deaths.
a1, b1 = 0.024636474551803556, 0.1806252301712788
a2, b2 = 0.006491019205331347, 0.15694857422845598
a3, b3 = -8.509657270186663e-16, 0.9999999967317126

print("Optimized parameters are:")
reported = (
    ("Confirmed", a1, b1),
    ("Cured", a2, b2),
    ("Deaths", popt3[0], popt3[1]),
)
for label, coeff_a, coeff_b in reported:
    print("\nFor " + label + ":")
    print("a:", coeff_a)
    print("b:", coeff_b)

# Predictions on the held-out days for each of the three targets.
pred1 = sigmoid(test_x, a1, b1)
pred2 = sigmoid(test_x, a2, b2)
pred3 = sigmoid(test_x, popt3[0], popt3[1])
from sklearn.metrics import r2_score

# --- Confirmed: predicted curve against the held-out observations ---
plt.figure(figsize=(7, 7))
plt.plot(test_x, pred1)
plt.plot(test_x, test_y1)
plt.show()
pred1
# r2_score takes (y_true, y_pred); the score is not symmetric in its
# arguments, so the observed values must come first.
r1 = r2_score(test_y1, pred1)
# Residual sum of squares: the squared errors are summed, not averaged.
rss1 = round(np.sum((pred1 - test['Confirmed'])**2), 3)
mae1 = round(np.mean(np.absolute(pred1 - test['Confirmed'])), 3)
print("For Confirmed:\n")
print("R-2 Score:", round(r1*100, 2), '%')
print("Residual Sum of Squares:", rss1)
print("Mean Absolute Error:", mae1)

# --- Cured ---
plt.figure(figsize=(7, 7))
plt.plot(test_x, pred2)
plt.plot(test_x, test_y2)
plt.show()
pred2
r2 = r2_score(test_y2, pred2)
rss2 = round(np.sum((pred2 - test['Cured'])**2), 3)
mae2 = round(np.mean(np.absolute(pred2 - test['Cured'])), 3)
print("For Cured:\n")
print("R-2 Score:", round(r2*100, 2), '%')
print("Residual Sum of Squares:", rss2)
print("Mean Absolute Error:", mae2)

# --- Deaths ---
# Predictions and observations are compared on their real scale. The earlier
# version multiplied them by hand-picked factors (-1 and +/-2,900,000) before
# plotting and scoring, which made the plot and the R-2 value meaningless and
# disagreed with the unscaled RSS / MAE computed just below.
plt.figure(figsize=(7, 7))
plt.plot(test_x, pred3)
plt.plot(test_x, test_y3)
plt.show()
pred3
r3 = r2_score(test_y3, pred3)
rss3 = round(np.sum((pred3 - test['Deaths'])**2), 3)
mae3 = round(np.mean(np.absolute(pred3 - test['Deaths'])), 3)
print("For Deaths:\n")
print("R-2 Score:", round(r3*100, 2), '%')
print("Residual Sum of Squares:", rss3)
print("Mean Absolute Error:", mae3)

# Summary table of the three R-2 scores (both columns moved into the index).
summ = {'Factor': ['Confirmed', 'Cured', 'Deaths'], 'R2 %': [round(r1*100, 2), round(r2*100, 2), round(r3*100, 2)]}
rep = pd.DataFrame(summ)
rep.set_index(['Factor', 'R2 %'], inplace=True)
rep
We will be continuing to predict the confirmed cases post lock-down and see the results.
Since the model fitted the confirmed and cured cases well, we will be predicting cases only for these two factors.
# Daily totals from day 57 onwards, summed over all states.
df_cv2 = df_cv[df_cv['Day'] >= 57].drop(columns=['State_ID'])
df_cv2 = df_cv2.groupby(['Day']).sum()
df_cv2.reset_index(inplace=True)

# Work on an explicit copy so that adding a column does not write into a
# slice of df_cv2.
df_preds = df_cv2[['Day', 'Confirmed']].copy()
# Evaluate the pre-lockdown exponential fit on exactly the days present in
# the data. A fixed range(57, 109) only works when the data set happens to
# contain those 52 days; any other length makes the assignment fail.
post_ld = [int(sigmoid(day, a1, b1)) for day in df_preds['Day']]
df_preds['Predicted'] = post_ld
df_preds.head()

# Observed confirmed cases from day 57 onwards.
plt.figure(figsize=(35, 20))
plt.bar(df_preds['Day'], df_preds['Confirmed'], align='center', alpha=0.9)
plt.title("Confirmed Cases Vs Days", size=40)
plt.xlabel("Days (From 57th Day)", size=35)
plt.xticks(fontsize=20, fontname='monospace', rotation=90)
plt.ylabel("Cases", size=35)
plt.yticks(fontsize=30, fontname='monospace')
plt.show()

# Cases predicted by the pre-lockdown curve for the same days.
plt.figure(figsize=(35, 20))
plt.bar(df_preds['Day'], df_preds['Predicted'], align='center', alpha=0.9, color='red')
plt.title("Predicted Confirmed Cases Vs Days", size=40)
plt.xlabel("Days (From 57th Day)", size=35)
plt.xticks(fontsize=20, fontname='monospace', rotation=90)
plt.ylabel("Cases", size=35)
plt.yticks(fontsize=30, fontname='monospace')
plt.show()

# Observed and predicted curves on one set of axes, then their histograms.
plt.plot(df_preds['Day'], df_preds['Confirmed'])
plt.plot(df_preds['Day'], df_preds['Predicted'])
plt.show()
df_preds.hist(['Confirmed', 'Predicted'])
plt.show()

print("Maximum cases during lockdown:", df_preds['Confirmed'].max())
print("Maximum cases if there was no lockdown:", df_preds['Predicted'].max())
# This is the relative gap between the predicted and the observed peak case
# counts; it is not a share of the country's population.
print("Percentage Population Saved:", round((df_preds['Predicted'].max()-df_preds['Confirmed'].max())/df_preds['Predicted'].max()*100, 2), "%")
Our estimates show that, due to the lockdown, the peak number of confirmed COVID-19 cases in India was about 98.83% lower than the number our model predicts for the same period without a lockdown. This indicates that the lockdown was successful in its implementation and was the correct decision.